import pandas as pd
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the policy dataset and take a quick look at its shape and contents.
policy = pd.read_csv("policy/policy.csv")
print(policy.head())
print(policy.count())
# Preview the first five rows as plain Python lists.
for row in policy.values.tolist()[:5]:
    print(row)


def is_chinese(word):
    """Return True if *word* contains at least one CJK unified ideograph.

    The check covers the basic CJK block U+4E00..U+9FFF; an empty string
    yields False.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)


# 去掉停用词
# 去掉停用词
def remove_stop_words(f):
    """Strip HTML/markup tokens and common Chinese stop words from *f*.

    Removal is by plain substring replacement, so order matters: we remove
    the longest stop words first, otherwise a shorter entry (e.g. '微软')
    would be stripped out of a longer compound ('微软雅黑') first and the
    compound entry would never match, leaving fragments behind.

    Args:
        f: The raw text to clean.

    Returns:
        The text with every stop-word substring removed.
    """
    stop_words = ['p', '宋体', '方正', '仿宋', '各区县', '重庆市', '重点', '和', '的', '对', '等', '编辑', '助理',
                  'span', '楷体', 'br', '年', '月', '日', '在', '或', '微软', '雅黑', '微软雅黑',
                  '黑体']
    # Longest-first so compound stop words are removed intact.
    for stop_word in sorted(stop_words, key=len, reverse=True):
        f = f.replace(stop_word, '')
    return f


# 生成词云
def create_word_cloud(f):
    print('根据词频，开始生成词云!')
    f = remove_stop_words(f)
    cut_text = ""
    jieba_result = jieba.cut(f, cut_all=False, HMM=True)
    for i in jieba_result:
        if len(i) > 1:
            cut_text += i + " "
    wc = WordCloud(
        font_path="./wc.ttf",
        max_words=100,
        width=2000,
        height=1200,
    )
    print(cut_text)
    wordcloud = wc.generate(cut_text)
    # 写词云图片
    wordcloud.to_file("word_cloud_2020.jpg")
    # 显示词云文件
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# import re

# title = '<a helf="www.baidu.com" title="河南省">你好</a>'
# res = re.findall(r'<a.*?>(.*?)</a>', title)
# print(res)
# Gather every Chinese character from the first 100 policies whose
# NOTE(review): column index 17 appears to hold the policy body text that
# mentions "2020" — confirm against the CSV schema.
chunks = []
for row in policy.values.tolist()[:100]:
    text = row[17]
    if "2020" not in text:
        continue
    chunks.append("".join(ch for ch in text if is_chinese(ch)))
all_word = "".join(chunks)

create_word_cloud(all_word)
